# Install pacman only when it is missing. requireNamespace() checks for the
# package without attaching it, so the check itself does not change the
# search path.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# use this line for installing/loading
# pacman::p_load()
# Install dsbox from GitHub only when it is not already available, so the
# document does not download and reinstall the package on every run.
if (!requireNamespace("dsbox", quietly = TRUE)) {
  devtools::install_github("tidyverse/dsbox")
}
# HW 01
# 0 - Setup
# 1 - Road traffic accidents in Edinburgh
# load packages
library(tidyverse)
library(here)
library(countdown)
# Make theme_minimal() with a base size of 14 the default ggplot2 theme
ggplot2::theme_minimal(base_size = 14) |>
  ggplot2::theme_set()
# Limit the width of printed code output to 65 characters
options(width = 65)
# Default figure settings applied to every knitr chunk
knitr::opts_chunk$set(
  dpi = 300, # higher dpi, sharper image
  fig.align = "center", # center align figures
  fig.retina = 3, # dpi multiplier for displaying HTML output on retina
  fig.asp = 0.618, # the golden ratio
  fig.width = 7 # 7" width
)
# Load the accidents dataset.
# The path is built relative to the project root with here::here() instead of
# a machine-specific absolute path, so the document can run on other computers.
# NOTE(review): assumes the project root found by here is the folder that
# contains data/accidents.csv (hw_01_merjb in the original path) - confirm.
accidents <- read_csv(here::here("data", "accidents.csv"))
accidents |> glimpse()
Rows: 768
Columns: 31
$ id <chr> "2018950000002", "2018950000006", "2…
$ easting <dbl> 327174, 324874, 330500, 321890, 3201…
$ northing <dbl> 670941, 672457, 671750, 671640, 6693…
$ longitude <dbl> -3.167032, -3.204252, -3.114026, -3.…
$ latitude <dbl> 55.92600, 55.93926, 55.93376, 55.931…
$ police_force <dbl> 95, 95, 95, 95, 95, 95, 95, 95, 95, …
$ severity <chr> "Slight", "Slight", "Slight", "Sligh…
$ vehicles <dbl> 1, 1, 2, 3, 2, 3, 1, 1, 1, 2, 2, 1, …
$ casualties <dbl> 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, …
$ date <chr> "31/12/2018", "30/12/2018", "03/01/2…
$ day_of_week <chr> "Monday", "Sunday", "Wednesday", "Mo…
$ time <time> 14:59:00, 12:50:00, 14:34:00, 02:25…
$ district <dbl> 923, 923, 923, 923, 923, 923, 923, 9…
$ highway <chr> "S12000036", "S12000036", "S12000036…
$ first_road_class <chr> "Unclassified", "Unclassified", "A(M…
$ first_road_number <dbl> 0, 0, 6095, 71, 0, 720, 0, 0, 1, 700…
$ road_type <chr> "Single carriageway", "Single carria…
$ speed_limit <dbl> 20, 20, 20, 30, 30, 70, 20, 30, 20, …
$ junction_detail <chr> "Other junction", "Other junction", …
$ junction_control <chr> "Give way or uncontrolled", "Give wa…
$ second_road_class <chr> "Unclassified", "Missing / Out of ra…
$ second_road_number <dbl> 0, -1, 6106, 0, 0, 720, 0, 0, 0, 700…
$ ped_cross_human <chr> "None within 50 metres", "None withi…
$ ped_cross_physical <chr> "Pedestrian phase at traffic signal …
$ light <chr> "Daylight", "Daylight", "Daylight", …
$ weather <chr> "Fine + no high winds", "Fine + no h…
$ road_surface <chr> "Dry", "Dry", "Wet or damp", "Wet or…
$ special_condition <chr> "None", "None", "None", "None", "Non…
$ hazard <chr> "None", "None", "None", "None", "Non…
$ urban_rural <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
$ police <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "…
# Add logical indicator columns for day type (weekend / weekday) and for each
# severity level.
#
# The day sets are tested with `%in%`. Writing
# `day_of_week == "Saturday", "Sunday"` compares against the first day only
# and hands the remaining strings to mutate() as extra unnamed arguments,
# which creates constant columns named `"Sunday"`, `"Tuesday"`, and so on.
# Unlike `==`, `%in%` yields FALSE rather than NA for a missing day_of_week.
accidents_wrangle <- accidents |>
  mutate(
    weekend = day_of_week %in% c("Saturday", "Sunday"),
    weekday = day_of_week %in% c(
      "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"
    ),
    slight = severity == "Slight",
    # NOTE(review): confirm that "Severe" is a value that actually occurs in
    # `severity`; if the data uses a different label this column is all FALSE.
    severe = severity == "Severe",
    fatal = severity == "Fatal"
  )
accidents_wrangle |>glimpse()
Rows: 768
Columns: 41
$ id <chr> "2018950000002", "2018950000006", "2…
$ easting <dbl> 327174, 324874, 330500, 321890, 3201…
$ northing <dbl> 670941, 672457, 671750, 671640, 6693…
$ longitude <dbl> -3.167032, -3.204252, -3.114026, -3.…
$ latitude <dbl> 55.92600, 55.93926, 55.93376, 55.931…
$ police_force <dbl> 95, 95, 95, 95, 95, 95, 95, 95, 95, …
$ severity <chr> "Slight", "Slight", "Slight", "Sligh…
$ vehicles <dbl> 1, 1, 2, 3, 2, 3, 1, 1, 1, 2, 2, 1, …
$ casualties <dbl> 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, …
$ date <chr> "31/12/2018", "30/12/2018", "03/01/2…
$ day_of_week <chr> "Monday", "Sunday", "Wednesday", "Mo…
$ time <time> 14:59:00, 12:50:00, 14:34:00, 02:25…
$ district <dbl> 923, 923, 923, 923, 923, 923, 923, 9…
$ highway <chr> "S12000036", "S12000036", "S12000036…
$ first_road_class <chr> "Unclassified", "Unclassified", "A(M…
$ first_road_number <dbl> 0, 0, 6095, 71, 0, 720, 0, 0, 1, 700…
$ road_type <chr> "Single carriageway", "Single carria…
$ speed_limit <dbl> 20, 20, 20, 30, 30, 70, 20, 30, 20, …
$ junction_detail <chr> "Other junction", "Other junction", …
$ junction_control <chr> "Give way or uncontrolled", "Give wa…
$ second_road_class <chr> "Unclassified", "Missing / Out of ra…
$ second_road_number <dbl> 0, -1, 6106, 0, 0, 720, 0, 0, 0, 700…
$ ped_cross_human <chr> "None within 50 metres", "None withi…
$ ped_cross_physical <chr> "Pedestrian phase at traffic signal …
$ light <chr> "Daylight", "Daylight", "Daylight", …
$ weather <chr> "Fine + no high winds", "Fine + no h…
$ road_surface <chr> "Dry", "Dry", "Wet or damp", "Wet or…
$ special_condition <chr> "None", "None", "None", "None", "Non…
$ hazard <chr> "None", "None", "None", "None", "Non…
$ urban_rural <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
$ police <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "…
$ weekend <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ `"Sunday"` <chr> "Sunday", "Sunday", "Sunday", "Sunda…
$ weekday <lgl> TRUE, FALSE, FALSE, TRUE, FALSE, FAL…
$ `"Tuesday"` <chr> "Tuesday", "Tuesday", "Tuesday", "Tu…
$ `"Wednesday"` <chr> "Wednesday", "Wednesday", "Wednesday…
$ `"Thursday"` <chr> "Thursday", "Thursday", "Thursday", …
$ `"Friday"` <chr> "Friday", "Friday", "Friday", "Frida…
$ slight <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, …
$ severe <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
$ fatal <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, F…
# Make plot: density of accident times of day, filled by severity and split
# into weekday / weekend panels so the figure matches its
# "By day of week and severity" subtitle.
accidents_wrangle |>
  # Derive the panel label directly from day_of_week so the split does not
  # depend on helper columns created elsewhere.
  mutate(
    day_type = if_else(
      day_of_week %in% c("Saturday", "Sunday"), "Weekend", "Weekday"
    )
  ) |>
  ggplot(aes(x = time, fill = severity)) +
  geom_density(alpha = 0.5) +
  # One panel per day type, stacked vertically so the time axes line up
  facet_wrap(vars(day_type), ncol = 1) +
  theme_light() +
  # Unnamed colours are matched to the severity values in the scale's order
  scale_fill_manual(values = c("purple", "deepskyblue3", "yellow")) +
  labs(
    x = "Time of day",
    y = "Density",
    title = "Number of accidents throughout the day",
    subtitle = "By day of week and severity",
    caption = "Source: Road traffic accidents in Edinburgh, 2018,\n R Programming 101: Create a boxplot using R programming with the ggplot package"
  )
# 2 - NYC marathon winners
# load packages
library(tidyverse)
library(here)
library(countdown)
# Make theme_minimal() with a base size of 14 the default ggplot2 theme
ggplot2::theme_minimal(base_size = 14) |>
  ggplot2::theme_set()
# Limit the width of printed code output to 65 characters
options(width = 65)
# Default figure settings applied to every knitr chunk
knitr::opts_chunk$set(
  dpi = 300, # higher dpi, sharper image
  fig.align = "center", # center align figures
  fig.retina = 3, # dpi multiplier for displaying HTML output on retina
  fig.asp = 0.618, # the golden ratio
  fig.width = 7 # 7" width
)
# Load the NYC marathon winners dataset.
# The path is built relative to the project root with here::here() instead of
# a machine-specific absolute path, so the document can run on other computers.
# NOTE(review): assumes the project root found by here is the folder that
# contains data/nyc_marathon.csv (hw_01_merjb in the original path) - confirm.
nyc_marathon <- read_csv(here::here("data", "nyc_marathon.csv"))
# a. Histogram
# Distribution of winning times; rows without a recorded time are dropped.
ggplot(
  data = drop_na(nyc_marathon, time_hrs),
  mapping = aes(x = time_hrs)
) +
  geom_histogram(fill = "cornsilk4", binwidth = 0.02) +
  labs(
    title = "NYC Marathon Winners 1970 - 2020",
    x = "Times",
    y = NULL,
    caption = "Source: OpenIntro 25.0 https://openintrostat.github.io/openintro/reference/nyc_marathon.html"
  ) +
  theme_minimal()
# This histogram is skewed to the right. There appear to be two peaks of
# times and the data is not normally distributed.
# a. Boxplot
# Single boxplot of all winning times; rows without a recorded time are
# dropped. The y aesthetic is named explicitly instead of relying on an empty
# first positional argument in aes().
nyc_marathon |>
  drop_na(time_hrs) |>
  ggplot(aes(y = time_hrs)) +
  geom_boxplot() +
  theme_classic() +
  # Overlay the boxplot statistics as an error bar on top of the box
  stat_boxplot(geom = "errorbar", width = 0.2) +
  labs(
    title = "NYC Marathon Winners 1970 - 2020",
    x = NULL,
    y = "Times",
    caption = "Source: OpenIntro 25.0 https://openintrostat.github.io/openintro/reference/nyc_marathon.html"
  )
# The median for this boxplot is ~2.40 hrs, with the 25%ile at ~2.15 hrs and
# the 75%ile at ~2.48 hrs and some outliers at over 2.85hrs and >3hrs. We
# cannot observe this level of data in the histogram.
# b. Boxplot by Gender
# One box per division, filled by division; rows without a recorded time are
# dropped.
ggplot(
  data = drop_na(nyc_marathon, time_hrs),
  mapping = aes(x = division, y = time_hrs, fill = division)
) +
  geom_boxplot() +
  stat_boxplot(width = 0.2, geom = "errorbar") +
  scale_fill_manual(values = c("cornsilk4", "deepskyblue3")) +
  labs(
    title = "NYC Marathon Winners 1970 - 2020",
    x = "Division",
    y = "Times",
    caption = "Source: OpenIntro 25.0 \nhttps://openintrostat.github.io/openintro/reference/nyc_marathon.html"
  ) +
  theme_classic() +
  # Legend goes in the top-right corner of the panel on a white box
  theme(
    legend.box.background = element_rect(color = "white", fill = "white"),
    legend.position = c(0.9, 0.9)
  )
# This boxplot is now much more clear in terms of race times by gender. It is
# clear that the marathon times differ between Men and Women.
# c. Boxplot updated
# Same per-division boxplot as in (b), with the axes flipped by coord_flip().
ggplot(
  data = drop_na(nyc_marathon, time_hrs),
  mapping = aes(x = division, y = time_hrs, fill = division)
) +
  geom_boxplot() +
  stat_boxplot(width = 0.2, geom = "errorbar") +
  coord_flip() +
  scale_fill_manual(values = c("cornsilk4", "deepskyblue3")) +
  labs(
    title = "NYC Marathon Winners 1970 - 2020",
    x = "Division",
    y = "Times",
    caption = "Source: OpenIntro 25.0 \nhttps://openintrostat.github.io/openintro/reference/nyc_marathon.html"
  ) +
  theme_classic() +
  # Legend goes in the top-right corner of the panel on a white box
  theme(
    legend.box.background = element_rect(color = "white", fill = "white"),
    legend.position = c(0.9, 0.9)
  )
# There was not much redundancy in this second iteration of the boxplot;
# however, to improve the data-to-ink ratio, if we look at the data
# vertically, it becomes easier to read.
# d. Marathon times over the years by gender
# Winning time per year, one coloured line per division; rows without a
# recorded time are dropped.
ggplot(
  data = drop_na(nyc_marathon, time_hrs),
  mapping = aes(x = year, y = time_hrs, color = division)
) +
  # Points are drawn without their own legend entry
  geom_point(show.legend = FALSE) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = c("cornsilk4", "deepskyblue3")) +
  labs(
    title = "NYC Marathon Winners 1970 - 2020",
    x = "Year",
    y = "Time",
    caption = "Source: OpenIntro 25.0 \nhttps://openintrostat.github.io/openintro/reference/nyc_marathon.html"
  ) +
  theme_classic() +
  # Legend goes in the top-right corner of the panel on a white box
  theme(
    legend.box.background = element_rect(color = "white", fill = "white"),
    legend.position = c(0.9, 0.9)
  )
# In this line graph, we see that over the years, both men and women have
# gotten faster, however it is clear that Men run faster times than women and
# that the lines never cross nor overlap. Additionally we see that in 2020,
# the marathon time was slower than the past 4 decades, likely due to the
# virtual events held due to the Covid-19 pandemic.
# 3 - US counties
#Source: R/data-county.R
#ggplot(county) +
# geom_point(aes(x = median_edu, y = median_hh_income)) +
#geom_boxplot(aes(x = smoking_ban, y = pop2017))